library(tidyverse)
library(scales)
library(ggrepel)
theme_set(theme_light())

LETS LOOK AT SOME GRADUATES YO

recent_grads <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-10-16/recent-grads.csv")

── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_double(),
  Major = col_character(),
  Major_category = col_character()
)
ℹ Use `spec()` for the full column specifications.
majors_processed <- recent_grads %>%
  arrange(desc(Median)) %>%
  mutate(Major = str_to_title(Major), Major = fct_reorder(Major, Median)) 

#Category of majors

majors_processed %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(aes(Major_category, Median, fill = Major_category)) +
  geom_boxplot() +
  scale_y_continuous(labels = dollar_format()) +
  coord_flip() +
  expand_limits(y=0) +
  theme(legend.position = "none")

#Highest earning majors

majors_processed <- recent_grads %>%
  arrange(desc(Median)) %>%
  select(Major, Major_category, Median, P25th, P75th, Sample_size) %>%
  mutate(Major = str_to_title(Major), Major = fct_reorder(Major, Median)) 
majors_processed %>%
  filter(Sample_size >=100) %>%
  head(20) %>%
  ggplot(aes(Major,Median, color = Major_category)) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  coord_flip() +
  labs(title="Highest earning Majors?",
       subtitle = "Top 20 Majors with at least 100 students surveyed, bars are 25th and 75th percentile",
       x="",
       y="Median Salary of Graduates")

#Lowest earning majors

majors_processed %>%
  tail(20) %>%
  mutate(Major = str_to_title(Major), Major = fct_reorder(Major, Median)) %>%
  ggplot(aes(Major,Median, color = Major_category)) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  coord_flip()

majors_processed %>%
  ggplot(aes(Sample_size, Median)) +
  geom_point() +
  geom_text(aes(label=Major, check_overlap=TRUE,vjust=1, hjust=1)) +
  scale_x_log10()
Ignoring unknown aesthetics: check_overlap

Most common majors

majors_processed %>%
  mutate(Major_category = fct_reorder(Major, Total)) %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  ggplot(aes(Major_category, Total, fill=Major_category)) +
  geom_col() +
  coord_flip() +
  labs(x="",
       y = "Total Graduates #") +
  scale_y_continuous(labels = comma_format()) +
  theme(legend.position="none")

How does gender breakdown relate to earnings

majors_processed %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major,Total)) %>%
  gather(Gender, Number, Men, Women) %>%
  select(Major, Gender, Number) %>%
  ggplot(aes(Major, Number, fill = Gender)) +
  geom_col() +
  coord_flip()

by_major_category <- majors_processed %>%
  filter(!is.na(Total)) %>%
  group_by(Major_category) %>%
  summarize(Men = sum(Men),
            Women =sum(Women),
            Total = sum(Total),
            MedianSalary = sum( Median * Sample_size) / sum(Sample_size)) %>%
  mutate(ShareWomen = Women / Total) %>%
  arrange(desc(ShareWomen))
`summarise()` ungrouping output (override with `.groups` argument)
library(plotly)
g<- by_major_category %>%
  ggplot(aes(ShareWomen, MedianSalary, color=Major_category)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_x_continuous(label = percent_format())+
  scale_y_continuous(label = dollar_format()) +
  expand_limits(y=0)
ggplotly(g)
`geom_smooth()` using formula 'y ~ x'
majors_processed %>%
  select(Major, Total, ShareWomen, Sample_size, Median) %>%
  lm(Median ~ ShareWomen, data = ., weights = Sample_size) %>%
  summary()

Call:
lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size)

Weighted Residuals:
    Min      1Q  Median      3Q     Max 
-260500  -61042  -13899   33262  865081 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)    52073       1436  36.255   <2e-16 ***
ShareWomen    -23650       2403  -9.842   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 123000 on 170 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.363, Adjusted R-squared:  0.3592 
F-statistic: 96.87 on 1 and 170 DF,  p-value: < 2.2e-16
library(broom)
majors_processed %>%
  select(Major, Major_category, Total, ShareWomen, Sample_size, Median) %>%
  add_count(Major_category) %>%
  filter(n>=10) %>%
  nest(-Major_category) %>%
  mutate(model = map(data, ~lm(Median ~ ShareWomen, data = ., weights = Sample_size)),
         tidied = map(model,tidy)) %>%
  unnest(tidied) %>%
  filter(term == "ShareWomen") %>%
  arrange(estimate) %>%
  mutate(fdr = p.adjust(p.value, method = "fdr"))
All elements of `...` must be named.
Did you want `data = c(Major, Total, ShareWomen, Sample_size, Median, n)`?
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQpgYGB7cn0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoc2NhbGVzKQpsaWJyYXJ5KGdncmVwZWwpCgp0aGVtZV9zZXQodGhlbWVfbGlnaHQoKSkKYGBgCgojIExFVFMgTE9PSyBBVCBTT01FIEdSQURVQVRFUyBZTwpgYGB7cn0KcmVjZW50X2dyYWRzIDwtIHJlYWRfY3N2KCJodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vcmZvcmRhdGFzY2llbmNlL3RpZHl0dWVzZGF5L21hc3Rlci9kYXRhLzIwMTgvMjAxOC0xMC0xNi9yZWNlbnQtZ3JhZHMuY3N2IikKCm1ham9yc19wcm9jZXNzZWQgPC0gcmVjZW50X2dyYWRzICU+JQogIGFycmFuZ2UoZGVzYyhNZWRpYW4pKSAlPiUKICBtdXRhdGUoTWFqb3IgPSBzdHJfdG9fdGl0bGUoTWFqb3IpLCBNYWpvciA9IGZjdF9yZW9yZGVyKE1ham9yLCBNZWRpYW4pKSAKYGBgCiNDYXRlZ29yeSBvZiBtYWpvcnMKYGBge3J9Cm1ham9yc19wcm9jZXNzZWQgJT4lCiAgbXV0YXRlKE1ham9yX2NhdGVnb3J5ID0gZmN0X3Jlb3JkZXIoTWFqb3JfY2F0ZWdvcnksIE1lZGlhbikpICU+JQogIGdncGxvdChhZXMoTWFqb3JfY2F0ZWdvcnksIE1lZGlhbiwgZmlsbCA9IE1ham9yX2NhdGVnb3J5KSkgKwogIGdlb21fYm94cGxvdCgpICsKICBzY2FsZV95X2NvbnRpbnVvdXMobGFiZWxzID0gZG9sbGFyX2Zvcm1hdCgpKSArCiAgY29vcmRfZmxpcCgpICsKICBleHBhbmRfbGltaXRzKHk9MCkgKwogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbiA9ICJub25lIikKYGBgCiNIaWdoZXN0IGVhcm5pbmcgbWFqb3JzCmBgYHtyfQptYWpvcnNfcHJvY2Vzc2VkIDwtIHJlY2VudF9ncmFkcyAlPiUKICBhcnJhbmdlKGRlc2MoTWVkaWFuKSkgJT4lCiAgc2VsZWN0KE1ham9yLCBNYWpvcl9jYXRlZ29yeSwgTWVkaWFuLCBQMjV0aCwgUDc1dGgsIFNhbXBsZV9zaXplKSAlPiUKICBtdXRhdGUoTWFqb3IgPSBzdHJfdG9fdGl0bGUoTWFqb3IpLCBNYWpvciA9IGZjdF9yZW9yZGVyKE1ham9yLCBNZWRpYW4pKSAKCm1ham9yc19wcm9jZXNzZWQgJT4lCiAgZmlsdGVyKFNhbXBsZV9zaXplID49MTAwKSAlPiUKICBoZWFkKDIwKSAlPiUKICBnZ3Bsb3QoYWVzKE1ham9yLE1lZGlhbiwgY29sb3IgPSBNYWpvcl9jYXRlZ29yeSkpICsKICBnZW9tX3BvaW50KCkgKwogIGdlb21fZXJyb3JiYXIoYWVzKHltaW4gPSBQMjV0aCwgeW1heCA9IFA3NXRoKSkgKwogIGNvb3JkX2ZsaXAoKSArCiAgbGFicyh0aXRsZT0iSGlnaGVzdCBlYXJuaW5nIE1ham9ycz8iLAogICAgICAgc3VidGl0bGUgPSAiVG9wIDIwIE1ham9ycyB3aXRoIGF0IGxlYXN0IDEwMCBzdHVkZW50cyBzdXJ2ZXllZCwgYmFycyBhcmUgMjV0aCBhbmQgNzV0aCBwZXJjZW50aWxlIiwKICAgICAgIHg9IiIsCiAgICAgICB5PSJNZWRpYW4gU2FsYXJ5IG9mIEdyYWR1YXRlcyIpCmBgYAojTG93ZXN0IGVhcm5pbmcgbWFqb3JzCmBgYHtyfQptYWpvcnNfcHJvY2Vzc2VkICU+JQogIHRhaWwoMjApICU+JQogIG11dGF0ZShNYWpvciA9IHN0cl90b190aXRsZShNYWpvciksIE1ham9yID0gZmN0X3Jlb3JkZXIoTWFqb3IsIE1lZGlhbikpICU+JQogIGdncGxvdChhZXMoTWFqb3IsTWVkaWFuLCBjb2xvciA9IE1ham9yX2NhdGVnb3J5KSkgKwogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV9lcnJvcmJhcihhZXMoeW1pbiA9IFAyNXRoLCB5bWF4ID0gUDc1dGgpKSArCiAgY29vcmRfZmxpcCgpCmBgYAoKYGBge3J9Cm1ham9yc19wcm9jZXNzZWQgJT4lCiAgZ2dwbG90KGFlcyhTYW1wbGVfc2l6ZSwgTWVkaWFuKSkgKwogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV90ZXh0KGFlcyhsYWJlbD1NYWpvciwgY2hlY2tfb3ZlcmxhcD1UUlVFLHZqdXN0PTEsIGhqdXN0PTEpKSArCiAgc2NhbGVfeF9sb2cxMCgpCmBgYAoKIyMjIE1vc3QgY29tbW9uIG1ham9ycwpgYGB7cn0KbWFqb3JzX3Byb2Nlc3NlZCAlPiUKICBtdXRhdGUoTWFqb3JfY2F0ZWdvcnkgPSBmY3RfcmVvcmRlcihNYWpvciwgVG90YWwpKSAlPiUKICBhcnJhbmdlKGRlc2MoVG90YWwpKSAlPiUKICBoZWFkKDIwKSAlPiUKICBnZ3Bsb3QoYWVzKE1ham9yX2NhdGVnb3J5LCBUb3RhbCwgZmlsbD1NYWpvcl9jYXRlZ29yeSkpICsKICBnZW9tX2NvbCgpICsKICBjb29yZF9mbGlwKCkgKwogIGxhYnMoeD0iIiwKICAgICAgIHkgPSAiVG90YWwgR3JhZHVhdGVzICMiKSArCiAgc2NhbGVfeV9jb250aW51b3VzKGxhYmVscyA9IGNvbW1hX2Zvcm1hdCgpKSArCiAgdGhlbWUobGVnZW5kLnBvc2l0aW9uPSJub25lIikKYGBgCiMjIyBIb3cgZG9lcyBnZW5kZXIgYnJlYWtkb3duIHJlbGF0ZSB0byBlYXJuaW5ncwoKYGBge3J9Cm1ham9yc19wcm9jZXNzZWQgJT4lCiAgYXJyYW5nZShkZXNjKFRvdGFsKSkgJT4lCiAgaGVhZCgyMCkgJT4lCiAgbXV0YXRlKE1ham9yID0gZmN0X3Jlb3JkZXIoTWFqb3IsVG90YWwpKSAlPiUKICBnYXRoZXIoR2VuZGVyLCBOdW1iZXIsIE1lbiwgV29tZW4pICU+JQogIHNlbGVjdChNYWpvciwgR2VuZGVyLCBOdW1iZXIpICU+JQogIGdncGxvdChhZXMoTWFqb3IsIE51bWJlciwgZmlsbCA9IEdlbmRlcikpICsKICBnZW9tX2NvbCgpICsKICBjb29yZF9mbGlwKCkKYGBgCgpgYGB7cn0KYnlfbWFqb3JfY2F0ZWdvcnkgPC0gbWFqb3JzX3Byb2Nlc3NlZCAlPiUKICBmaWx0ZXIoIWlzLm5hKFRvdGFsKSkgJT4lCiAgZ3JvdXBfYnkoTWFqb3JfY2F0ZWdvcnkpICU+JQogIHN1bW1hcml6ZShNZW4gPSBzdW0oTWVuKSwKICAgICAgICAgICAgV29tZW4gPXN1bShXb21lbiksCiAgICAgICAgICAgIFRvdGFsID0gc3VtKFRvdGFsKSwKICAgICAgICAgICAgTWVkaWFuU2FsYXJ5ID0gc3VtKCBNZWRpYW4gKiBTYW1wbGVfc2l6ZSkgLyBzdW0oU2FtcGxlX3NpemUpKSAlPiUKICBtdXRhdGUoU2hhcmVXb21lbiA9IFdvbWVuIC8gVG90YWwpICU+JQogIGFycmFuZ2UoZGVzYyhTaGFyZVdvbWVuKSkKYGBgCgpgYGB7cn0KbGlicmFyeShwbG90bHkpCmc8LSBieV9tYWpvcl9jYXRlZ29yeSAlPiUKICBnZ3Bsb3QoYWVzKFNoYXJlV29tZW4sIE1lZGlhblNhbGFyeSwgY29sb3I9TWFqb3JfY2F0ZWdvcnkpKSArCiAgZ2VvbV9wb2ludCgpICsKICBnZW9tX3Ntb290aChtZXRob2QgPSAibG0iKSArCiAgc2NhbGVfeF9jb250aW51b3VzKGxhYmVsID0gcGVyY2VudF9mb3JtYXQoKSkrCiAgc2NhbGVfeV9jb250aW51b3VzKGxhYmVsID0gZG9sbGFyX2Zvcm1hdCgpKSArCiAgZXhwYW5kX2xpbWl0cyh5PTApCgpnZ3Bsb3RseShnKQpgYGAKCmBgYHtyfQptYWpvcnNfcHJvY2Vzc2VkICU+JQogIHNlbGVjdChNYWpvciwgVG90YWwsIFNoYXJlV29tZW4sIFNhbXBsZV9zaXplLCBNZWRpYW4pICU+JQogIGxtKE1lZGlhbiB+IFNoYXJlV29tZW4sIGRhdGEgPSAuLCB3ZWlnaHRzID0gU2FtcGxlX3NpemUpICU+JQogIHN1bW1hcnkoKQpgYGAKCmBgYHtyfQpsaWJyYXJ5KGJyb29tKQoKbWFqb3JzX3Byb2Nlc3NlZCAlPiUKICBzZWxlY3QoTWFqb3IsIE1ham9yX2NhdGVnb3J5LCBUb3RhbCwgU2hhcmVXb21lbiwgU2FtcGxlX3NpemUsIE1lZGlhbikgJT4lCiAgYWRkX2NvdW50KE1ham9yX2NhdGVnb3J5KSAlPiUKICBmaWx0ZXIobj49MTApICU+JQogIG5lc3QoLU1ham9yX2NhdGVnb3J5KSAlPiUKICBtdXRhdGUobW9kZWwgPSBtYXAoZGF0YSwgfmxtKE1lZGlhbiB+IFNoYXJlV29tZW4sIGRhdGEgPSAuLCB3ZWlnaHRzID0gU2FtcGxlX3NpemUpKSwKICAgICAgICAgdGlkaWVkID0gbWFwKG1vZGVsLHRpZHkpKSAlPiUKICB1bm5lc3QodGlkaWVkKSAlPiUKICBmaWx0ZXIodGVybSA9PSAiU2hhcmVXb21lbiIpICU+JQogIGFycmFuZ2UoZXN0aW1hdGUpICU+JQogIG11dGF0ZShmZHIgPSBwLmFkanVzdChwLnZhbHVlLCBtZXRob2QgPSAiZmRyIikpCmBgYAoK